from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, IntegerType, StringType

if __name__ == '__main__':
    # Build the SparkSession:
    #   appName     - sets the application name
    #   master      - sets the master URL; "local[*]" runs Spark locally.
    #                 NOTE: the original passed "local[*]" to appName by
    #                 mistake, leaving the master unset in code.
    #   config      - sets an arbitrary Spark property
    #   getOrCreate - returns the existing session or creates a new one
    spark = (
        SparkSession.builder
        .appName("read_parquet")
        .master("local[*]")
        .config("spark.sql.shuffle.partitions", "4")
        .getOrCreate()
    )

    try:
        # Parquet files carry their own schema, so none needs to be given.
        # The path is relative to the current working directory.
        df = spark.read.format("parquet").load("../../data/sql/users.parquet")

        df.printSchema()
        df.show(truncate=False)

        # Printed result:
        # +------+--------------+----------------+
        # |name  |favorite_color|favorite_numbers|
        # +------+--------------+----------------+
        # |Alyssa|null          |[3, 9, 15, 20]  |
        # |Ben   |red           |[]              |
        # +------+--------------+----------------+
    finally:
        # Always release the session, even if the read or show fails.
        spark.stop()
